This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load required libraries
library(Lahman)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gapminder)
library(dplyr)
library(broom)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Preview the first six rows of the People table
head(People, n = 6L)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 aardsda01 1981 12 27 USA CO Denver
## 2 aaronha01 1934 2 5 USA AL Mobile
## 3 aaronto01 1939 8 5 USA AL Mobile
## 4 aasedo01 1954 9 8 USA CA Orange
## 5 abadan01 1972 8 25 USA FL Palm Beach
## 6 abadfe01 1985 12 17 D.R. La Romana La Romana
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 NA NA NA <NA> <NA> <NA> David
## 2 2021 1 22 USA GA Atlanta Hank
## 3 1984 8 16 USA GA Atlanta Tommie
## 4 NA NA NA <NA> <NA> <NA> Don
## 5 NA NA NA <NA> <NA> <NA> Andy
## 6 NA NA NA <NA> <NA> <NA> Fernando
## nameLast nameGiven weight height bats throws debut finalGame
## 1 Aardsma David Allan 215 75 R R 2004-04-06 2015-08-23
## 2 Aaron Henry Louis 180 72 R R 1954-04-13 1976-10-03
## 3 Aaron Tommie Lee 190 75 R R 1962-04-10 1971-09-26
## 4 Aase Donald William 190 75 R R 1977-07-26 1990-10-03
## 5 Abad Fausto Andres 184 73 L L 2001-09-10 2006-04-13
## 6 Abad Fernando Antonio 235 74 L L 2010-07-28 2021-10-01
## retroID bbrefID deathDate birthDate
## 1 aardd001 aardsda01 <NA> 1981-12-27
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05
## 3 aarot101 aaronto01 1984-08-16 1939-08-05
## 4 aased001 aasedo01 <NA> 1954-09-08
## 5 abada001 abadan01 <NA> 1972-08-25
## 6 abadf001 abadfe01 <NA> 1985-12-17
# Count how many players share each first name within each birth year.
# `.groups = "drop"` returns an ungrouped tibble directly, so no trailing
# ungroup() is needed and summarise() emits no regrouping message.
df <- People %>%
  select(birthYear, nameFirst) %>%
  group_by(birthYear, nameFirst) %>%
  summarise(n = n(), .groups = "drop")
## `summarise()` has grouped output by 'birthYear'. You can override using the
## `.groups` argument.
# Within each birth year, convert every name's count into its share of that
# year's total. The result stays grouped by birthYear.
df <- mutate(group_by(df, birthYear), prop = n / sum(n))
# Keep the four reporting columns, then order rows by birth year and, within
# a birth year, from the most to the least frequent first name.
final_tibble <- arrange(
  select(df, birthYear, nameFirst, n, prop),
  birthYear, desc(n)
)
# Display the result
final_tibble
## # A tibble: 13,179 × 4
## # Groups: birthYear [171]
## birthYear nameFirst n prop
## <int> <chr> <int> <dbl>
## 1 1820 Alexander 1 1
## 2 1824 Henry 1 1
## 3 1832 Nate 1 0.5
## 4 1832 William 1 0.5
## 5 1835 Harry 1 1
## 6 1836 Dickey 1 1
## 7 1837 Morgan 1 1
## 8 1838 Bill 1 0.333
## 9 1838 Dave 1 0.333
## 10 1838 Lew 1 0.333
## # … with 13,169 more rows
# Rows of final_tibble whose first name begins with a capital "Y"
subset_y <- filter(final_tibble, str_starts(nameFirst, "Y"))
subset_y
## # A tibble: 68 × 4
## # Groups: birthYear [35]
## birthYear nameFirst n prop
## <int> <chr> <int> <dbl>
## 1 1859 Yank 1 0.0115
## 2 1869 Yale 1 0.00971
## 3 1873 Youngy 1 0.0116
## 4 1886 Yip 1 0.00725
## 5 1892 Yam 1 0.00662
## 6 1903 Yats 1 0.00917
## 7 1911 Yank 1 0.0102
## 8 1925 Yogi 1 0.0106
## 9 1928 Yo-Yo 1 0.00909
## 10 1967 Yorkis 1 0.00529
## # … with 58 more rows
# Create a subset of the tibble with first names that contain at least three
# vowels. Matching is case-insensitive so that a capitalised leading vowel
# (e.g. the "E" in "Eddie") is counted as well.
subset_vowels <- final_tibble %>%
  filter(str_count(nameFirst, regex("[aeiou]", ignore_case = TRUE)) >= 3)
subset_vowels
## # A tibble: 1,588 × 4
## # Groups: birthYear [161]
## birthYear nameFirst n prop
## <int> <chr> <int> <dbl>
## 1 1820 Alexander 1 1
## 2 1832 William 1 0.5
## 3 1840 Charlie 1 0.143
## 4 1840 George 1 0.143
## 5 1840 Washington 1 0.143
## 6 1843 Charlie 1 0.125
## 7 1844 Charlie 1 0.0556
## 8 1844 Cherokee 1 0.0556
## 9 1844 George 1 0.0556
## 10 1845 Freeman 1 0.0556
## # … with 1,578 more rows
# Attach each player's name fields to the Fielding records, then add up G
# per player to get a career total. The grouping keys (playerID, nameFirst,
# nameLast, nameGiven) followed by G_career are the only columns that
# summarise() returns, so no explicit select() is required.
player_totals <- Fielding %>%
  inner_join(People, by = "playerID") %>%
  group_by(playerID, nameFirst, nameLast, nameGiven) %>%
  summarise(G_career = sum(G), .groups = "drop")
head(player_totals)
## # A tibble: 6 × 5
## playerID nameFirst nameLast nameGiven G_career
## <chr> <chr> <chr> <chr> <int>
## 1 aardsda01 David Aardsma David Allan 331
## 2 aaronha01 Hank Aaron Henry Louis 3020
## 3 aaronto01 Tommie Aaron Tommie Lee 387
## 4 aasedo01 Don Aase Donald William 448
## 5 abadan01 Andy Abad Fausto Andres 9
## 6 abadfe01 Fernando Abad Fernando Antonio 400
# Add 'fullName': first and last name joined by a single space
player_totals <- player_totals %>%
  mutate(fullName = paste(nameFirst, nameLast, sep = " "))
# Show the first few rows of the updated table
head(player_totals)
## # A tibble: 6 × 6
## playerID nameFirst nameLast nameGiven G_career fullName
## <chr> <chr> <chr> <chr> <int> <chr>
## 1 aardsda01 David Aardsma David Allan 331 David Aardsma
## 2 aaronha01 Hank Aaron Henry Louis 3020 Hank Aaron
## 3 aaronto01 Tommie Aaron Tommie Lee 387 Tommie Aaron
## 4 aasedo01 Don Aase Donald William 448 Don Aase
## 5 abadan01 Andy Abad Fausto Andres 9 Andy Abad
## 6 abadfe01 Fernando Abad Fernando Antonio 400 Fernando Abad
# Keep only players with a career total of 500 or more games
player_totals_filtered <- filter(player_totals, G_career >= 500)
# Tally players per first name, most common first, and keep the top five
top_first_names <- player_totals_filtered %>%
  count(nameFirst, name = "n_players", sort = TRUE) %>%
  head(5)
# View the result
top_first_names
## # A tibble: 5 × 2
## nameFirst n_players
## <chr> <int>
## 1 Mike 80
## 2 Joe 60
## 3 John 56
## 4 Bill 55
## 5 Jim 52
1. Get the data in a single data frame: create 3 data frames (or tibbles) from these files, then combine the 3 data frames into one.
# Locations of the three per-film CSV files
url1 <- "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Fellowship_Of_The_Ring.csv"
url2 <- "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Two_Towers.csv"
url3 <- "https://raw.githubusercontent.com/jennybc/lotr-tidy/master/data/The_Return_Of_The_King.csv"
# Download each file and set its Film column to the film's title
lotr1 <- mutate(read.csv(url1), Film = "The Fellowship Of The Ring")
lotr2 <- mutate(read.csv(url2), Film = "The Two Towers")
lotr3 <- mutate(read.csv(url3), Film = "The Return Of The King")
# Stack the three data frames into one
lotr_combined <- bind_rows(lotr1, lotr2, lotr3)
lotr_combined
## Film Race Female Male
## 1 The Fellowship Of The Ring Elf 1229 971
## 2 The Fellowship Of The Ring Hobbit 14 3644
## 3 The Fellowship Of The Ring Man 0 1995
## 4 The Two Towers Elf 331 513
## 5 The Two Towers Hobbit 0 2463
## 6 The Two Towers Man 401 3589
## 7 The Return Of The King Elf 183 510
## 8 The Return Of The King Hobbit 2 2673
## 9 The Return Of The King Man 268 2459
2. Tidy the combined data frame by creating new variables “Gender” and “Words”.
# Reshape to long format: the Female and Male column names become values of
# "Gender" and their cell values go into "Words".
lotr_tidy <- pivot_longer(
  lotr_combined,
  cols = c("Female", "Male"),
  names_to = "Gender",
  values_to = "Words"
)
lotr_tidy
## # A tibble: 18 × 4
## Film Race Gender Words
## <chr> <chr> <chr> <int>
## 1 The Fellowship Of The Ring Elf Female 1229
## 2 The Fellowship Of The Ring Elf Male 971
## 3 The Fellowship Of The Ring Hobbit Female 14
## 4 The Fellowship Of The Ring Hobbit Male 3644
## 5 The Fellowship Of The Ring Man Female 0
## 6 The Fellowship Of The Ring Man Male 1995
## 7 The Two Towers Elf Female 331
## 8 The Two Towers Elf Male 513
## 9 The Two Towers Hobbit Female 0
## 10 The Two Towers Hobbit Male 2463
## 11 The Two Towers Man Female 401
## 12 The Two Towers Man Male 3589
## 13 The Return Of The King Elf Female 183
## 14 The Return Of The King Elf Male 510
## 15 The Return Of The King Hobbit Female 2
## 16 The Return Of The King Hobbit Male 2673
## 17 The Return Of The King Man Female 268
## 18 The Return Of The King Man Male 2459
# 3a. Total number of words spoken in each film
words_by_movie <- summarise(
  group_by(lotr_tidy, Film),
  total_words = sum(Words)
)
words_by_movie
## # A tibble: 3 × 2
## Film total_words
## <chr> <int>
## 1 The Fellowship Of The Ring 7853
## 2 The Return Of The King 6095
## 3 The Two Towers 7297
# 3b. Total number of words spoken by each gender across all films
words_by_gender <- summarise(
  group_by(lotr_tidy, Gender),
  total_words = sum(Words)
)
words_by_gender
## # A tibble: 2 × 2
## Gender total_words
## <chr> <int>
## 1 Female 2428
## 2 Male 18817
# 3c. Total number of words spoken by each race across all films
words_by_race <- summarise(
  group_by(lotr_tidy, Race),
  total_words = sum(Words)
)
words_by_race
## # A tibble: 3 × 2
## Race total_words
## <chr> <int>
## 1 Elf 3737
## 2 Hobbit 8796
## 3 Man 8712
# Total number of words for every Race/Film combination, returned ungrouped
by_race_film <- summarise(
  group_by(lotr_tidy, Race, Film),
  total_words = sum(Words),
  .groups = "drop"
)
by_race_film
## # A tibble: 9 × 3
## Race Film total_words
## <chr> <chr> <int>
## 1 Elf The Fellowship Of The Ring 2200
## 2 Elf The Return Of The King 693
## 3 Elf The Two Towers 844
## 4 Hobbit The Fellowship Of The Ring 3658
## 5 Hobbit The Return Of The King 2675
## 6 Hobbit The Two Towers 2463
## 7 Man The Fellowship Of The Ring 1995
## 8 Man The Return Of The King 2727
## 9 Man The Two Towers 3990
Split/group the gapminder data by year. For each year, fit a regression of life expectancy on log10(gdpPercap), obtain clean coefficient outputs with broom, and combine the slope coefficients into a single tibble like the following:
# Make the gapminder dataset available in the workspace
data(gapminder)
# For each year, regress lifeExp on log10(gdpPercap) and tidy the fitted
# model's coefficient table; inside do(), `.` stands for the rows of the
# current year.
reg_results <- do(
  group_by(gapminder, year),
  tidy(lm(lifeExp ~ log10(gdpPercap), data = .))
)
# Keep only the slope rows (term "log10(gdpPercap)") and the columns that
# describe the estimate for each year.
slope_coefs <- select(
  filter(reg_results, term == "log10(gdpPercap)"),
  year, estimate, std.error, statistic, p.value
)
# Print the slope coefficients tibble
slope_coefs
## # A tibble: 12 × 5
## # Groups: year [12]
## year estimate std.error statistic p.value
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1952 20.3 1.53 13.3 1.12e-26
## 2 1957 20.1 1.46 13.8 7.55e-28
## 3 1962 19.8 1.38 14.3 3.62e-29
## 4 1967 18.5 1.29 14.4 1.86e-29
## 5 1972 17.5 1.15 15.2 1.82e-31
## 6 1977 17.4 1.05 16.6 9.04e-35
## 7 1982 17.3 0.919 18.8 4.46e-40
## 8 1987 17.0 0.796 21.3 8.25e-46
## 9 1992 17.3 0.885 19.6 6.55e-42
## 10 1997 17.5 0.863 20.3 1.46e-43
## 11 2002 17.5 1.01 17.3 1.55e-36
## 12 2007 16.6 1.02 16.3 4.12e-34
# Plot the slope coefficients against year
plot <- ggplot(slope_coefs, aes(x = year, y = estimate)) +
  geom_line() +
  labs(x = "Year", y = "Slope coefficient")
# Convert to an interactive plotly figure. The plot object is passed
# explicitly rather than relying on the implicit "last plot" default, so the
# call renders this figure regardless of what was plotted before it.
ggplotly(plot)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.